{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import html\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>text</th>\n",
       "      <th>stance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>开放二胎</td>\n",
       "      <td>刚回家几天就迫不及待的赶到了小舅家，看着乖巧懂事的表妹和可爱的小表弟，心情格外舒畅！这个画面...</td>\n",
       "      <td>FAVOR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>俄罗斯在叙利亚的反恐行动</td>\n",
       "      <td>俄罗斯就是流氓</td>\n",
       "      <td>AGAINST</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>春节放鞭炮</td>\n",
       "      <td>#春节放鞭炮#【中央气象台首次发布烟花爆竹燃放气象指数】明天就是除夕了，年味越发浓郁。今早，...</td>\n",
       "      <td>AGAINST</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>IphoneSE</td>\n",
       "      <td>iPhoneSE貌似摄像头不外突了，普天同庆</td>\n",
       "      <td>FAVOR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>春节放鞭炮</td>\n",
       "      <td>千万人口级城市北京，原住民在庆祝第一大节日春节，大家伙都心照不宣的想到：别再给伤痕累累的家乡...</td>\n",
       "      <td>AGAINST</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         target                                               text   stance\n",
       "0          开放二胎  刚回家几天就迫不及待的赶到了小舅家，看着乖巧懂事的表妹和可爱的小表弟，心情格外舒畅！这个画面...    FAVOR\n",
       "1  俄罗斯在叙利亚的反恐行动                                            俄罗斯就是流氓  AGAINST\n",
       "2         春节放鞭炮  #春节放鞭炮#【中央气象台首次发布烟花爆竹燃放气象指数】明天就是除夕了，年味越发浓郁。今早，...  AGAINST\n",
       "3      IphoneSE                             iPhoneSE貌似摄像头不外突了，普天同庆    FAVOR\n",
       "4         春节放鞭炮  千万人口级城市北京，原住民在庆祝第一大节日春节，大家伙都心照不宣的想到：别再给伤痕累累的家乡...  AGAINST"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.read_csv('../data/train.csv', sep='\\t')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>IphoneSE</td>\n",
       "      <td>讲真，对iphoneSE很心动，但是又很期待iphone7，心塞</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>春节放鞭炮</td>\n",
       "      <td>传统春节来临传统的拜神，放鞭炮，烧纸，热闹呢</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>俄罗斯在叙利亚的反恐行动</td>\n",
       "      <td>俄罗斯在战争状态下的紧急动员能力，这不是土耳其可以低估的。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>深圳禁摩限电</td>\n",
       "      <td>珠海要是有这么高强度，市区就不会有那么多摩托车横冲直撞了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>深圳禁摩限电</td>\n",
       "      <td>#深圳禁摩限电# 早该整了，快递开电车，真把马路当成他们家开的一样...不过他们工作压力大，...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         target                                               text\n",
       "0      IphoneSE                   讲真，对iphoneSE很心动，但是又很期待iphone7，心塞\n",
       "1         春节放鞭炮                             传统春节来临传统的拜神，放鞭炮，烧纸，热闹呢\n",
       "2  俄罗斯在叙利亚的反恐行动                      俄罗斯在战争状态下的紧急动员能力，这不是土耳其可以低估的。\n",
       "3        深圳禁摩限电                       珠海要是有这么高强度，市区就不会有那么多摩托车横冲直撞了\n",
       "4        深圳禁摩限电  #深圳禁摩限电# 早该整了，快递开电车，真把马路当成他们家开的一样...不过他们工作压力大，..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df = pd.read_csv('../data/test.csv', sep='\\t')\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def solve(row):\n",
    "    columns = ['target', 'text']\n",
    "    ans = row['target'] + row['text']\n",
    "    return ans    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['text'] = train_df.apply(lambda row: solve(row), axis=1)\n",
    "test_df['text'] = test_df.apply(lambda row: solve(row), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "319\n"
     ]
    }
   ],
   "source": [
    "texts = train_df.text.tolist()\n",
    "max_len = 0\n",
    "df_len_list = []\n",
    "for i in range(len(texts)):\n",
    "    text = []\n",
    "    text = texts[i]\n",
    "    df_len_list.append(len(text))\n",
    "    max_len = max(max_len, len(text))\n",
    "print(max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "222"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted_len = sorted(df_len_list)\n",
    "sorted_len[int(len(sorted_len)*0.999)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AGAINST    989\n",
       "FAVOR      912\n",
       "NONE       499\n",
       "Name: stance, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.stance.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
